### This file injests raw data from lab and mTurk sessions,
## creates common fields across datasets, merges them, 
## and cleans up some variable names  
### outuput is Ahlquist_Ansell_UIskills_cleaned.csv



rm( list=ls() )
set.seed(123)
#********************************* Begin Load Packages *************************************

# Standard Packages:
library(foreign)        # import datasets from other programs like STATA
library(readstata13)    # new CRAN package needed to read stata 13 and 14: read.dta13
library(data.table)     # reading in data
library(plyr)           # implement the split-apply-combine pattern in R
library(dplyr)          # the gold standard for data manipulation, specifically designed for data.frame-like objects
library(magrittr)       # pipe operator “%>%” This operator will forward a value, or the result of an expression, into the next function call/expression. https://www.r-bloggers.com/why-bother-with-magrittr/
library(MASS)

# Data Cleaning Packages
library(reshape2)       # older reshape package
library(tidyr)          # newer reshape wide-long commands compared to reshape2
library(doBy)           # collapse
library(dataMaid)       # codebook compiler 

# Tables and Figures Packages:
library(gridExtra)      # provides side-by-side plotting
library(stargazer)      # LaTeX code for Tables
#library(ggplot2)        # plotting & data
#library(viridis)        # color palettes using scale_fill_viridis() in ggplot

#********************************* End Load Packages *************************************

# setwd("/foo/") # set working directory
# or use here::here()
#-------------[] Load raw data ------------------------------------------------------------------------------------------------------------

lab <- read.csv("CESSraw.csv", fileEncoding="latin1")   # load lab experiment raw data
mturk <- read.csv("AA_19062018_AMT.csv", fileEncoding="latin1")   # load mTurk experiment raw data

#-------------[] Preliminary cleaning ------------------------------------------------------------------------------------------------------------

# drop the 2 top rows with extra column names. 
lab <- lab[3:nrow(lab),]

# rename vars in mturk dataframe to match with lab dataframe
names(lab)
names(mturk) 
colnames(mturk)[colnames(mturk)=="Q211"] <- "Q9"   # sex
colnames(mturk)[colnames(mturk)=="Q212"] <- "Q10"  # age
colnames(mturk)[colnames(mturk)=="Q213"] <- "Q11"   # race
colnames(mturk)[colnames(mturk)=="Q213_7_TEXT"] <- "Q11_5_TEXT"   # race_other
colnames(mturk)[colnames(mturk)=="Q214"] <- "Q12"   # place
colnames(mturk)[colnames(mturk)=="Q215"] <- "Q13"   # politics
colnames(mturk)[colnames(mturk)=="Q216"] <- "Q14"   # vote
colnames(mturk)[colnames(mturk)=="Q217"] <- "Q15"   # party
colnames(mturk)[colnames(mturk)=="Q218"] <- "Q16"   # ideology
colnames(mturk)[colnames(mturk)=="Q219"] <- "Q17"   # educ
colnames(mturk)[colnames(mturk)=="Q220"] <- "Q18"   # emp
colnames(mturk)[colnames(mturk)=="Q221"] <- "Q19"   # htotinc
colnames(mturk)[colnames(mturk)=="Q222"] <- "Q20"   # trust
colnames(mturk)[colnames(mturk)=="Q223"] <- "Q21"   # contact race
colnames(mturk)[colnames(mturk)=="Q224"] <- "Q22"   # friend race
colnames(mturk)[colnames(mturk)=="Q225"] <- "Q23"   # union
colnames(mturk)[colnames(mturk)=="id"] <- "ResponseId"   # id

# 2 lvl factor var for experimental context = "lab" or "mTurk"
lab$context <- "lab"
mturk$context <- "mTurk"
lab$context <- as.factor(lab$context)
mturk$context <- as.factor(mturk$context)

#-------------[] Append the mturk observations to lab ------------------------------------------------------------------------------------------------------------

dat <- rbind.fill(lab, mturk) # appends while entering NA for column names that have no match

#-------------[] Clean Investment DV ------------------------------------------------------

typeof(dat$Q43)
levels(dat$Q43)
dat$investment <- ifelse(dat$Q43=="Invest my account","yes","no")
dat$investment <- as.factor(dat$investment)
dat <- subset(dat, select= -Q43) 
summary(dat$investment)

#-------------[] Clean Waiting1 and Waiting2 DVs ------------------------------------------------------

names(dat) # identify column numbers
dat$id <- vctrs::vec_group_id(dat) # create an easier numeric identifier by subject.

dat_long_rejemp <- gather(dat[c(232,198:212)]   , round, rejemp, rejemp1:rejemp15, factor_key = TRUE) # reshape the rejemp[1-15] var from wide to long
dat_long_rejemp$round <- gsub("[a-z]", "", dat_long_rejemp$round)  # drop all lower case letters from new round var
dat_long_rejemp$round <- as.integer(dat_long_rejemp$round) # convert time var from chr to interger
dat_long_rejemp <- dat_long_rejemp[order(dat_long_rejemp$id,dat_long_rejemp$round),] # sort by id, round

dat_long_employed <- gather(dat[c(232,123:137)]   , round, employed, employed1:employed15, factor_key = TRUE) # reshape the employed[1-15] var from wide to long
dat_long_employed$round <- gsub("[a-z]", "", dat_long_employed$round)  # drop all lower case letters from new round var
dat_long_employed$round <- as.integer(dat_long_employed$round) # convert time var from chr to interger
dat_long_employed <- dat_long_employed[order(dat_long_employed$id,dat_long_employed$round),] # sort by id, round

dat_long_total <- gather(dat[c(232,108:122)]   , round, total, total1:total15, factor_key = TRUE) # reshape the total[1-15] var from wide to long
dat_long_total$round <- gsub("[a-z]", "", dat_long_total$round)  # drop all lower case letters from new round var
dat_long_total$round <- as.integer(dat_long_total$round) # convert time var from chr to interger
dat_long_total <- dat_long_total[order(dat_long_total$id,dat_long_total$round),] # sort by id, round

dat_long <- merge(dat_long_rejemp, dat_long_employed, by=c("id","round"), sort = TRUE, all=FALSE) # merge the two long vars 
dat_long <- merge(dat_long, dat_long_total, by=c("id","round"), sort = TRUE, all=FALSE) # merge the two long vars 
dat_long <- dat_long[order(dat_long$id,dat_long$round),] # sort by id, round
row.names(dat_long) <- NULL # reset numerical row names 
rm(dat_long_employed, dat_long_rejemp, dat_long_total)

dat_long$rejemp <- as.numeric(dat_long$rejemp) # convert from chr to int
dat_long$employed <- as.numeric(dat_long$employed) 
dat_long$total <- as.numeric(dat_long$total) 

dat_long$rejemp <- ifelse(dat_long$rejemp==1 | dat_long$rejemp==2, 1,0)   # convert to binary = 1 if rejected employment in task 1 or task 2. 
dat_long$employed <- ifelse(dat_long$employed==1 | dat_long$employed==2, 1,0)   # convert to binary = 1 if employed in task 1 or task 2. 

## for R2 RR
dat_long<- left_join(dat_long, dat[,c("id", "context", "investment")],
                     by = "id")

eff.dat<-subset(dat_long, employed==1)
eff.dat$total<-as.numeric(eff.dat$total)
max(eff.dat$total)
which(eff.dat$total==42) #ID 307; total outlier on effort
sd(eff.dat$total)
eff.inv.sum<-eff.dat |>   
  group_by(investment, context) |> 
  summarise(av_eff=mean(total, na.rm=TRUE),
            sd_eff = sd(total, na.rm=TRUE),
            max_eff=max(total, na.rm=TRUE))

#


dat_long <- dat_long %>% group_by(id) %>% mutate(tot.rejemp = sum(rejemp))  # gen tot number of times a respondents rejected an offer of reemployment. This will be the numerator for DV: waiting1

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(l.employed = lag(employed)) # lag emp var 

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(d.employed = employed - l.employed)  # create a change in employment variable

dat_long$reemp <- ifelse(dat_long$d.employed>0 ,1,0)  # binary = 1 if subject accepted reemployment; that is, went from emp=0 to emp=1

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(tot.reemp = sum(reemp, na.rm=TRUE))  # sum the number of times a subject accepted reemployment. 

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(offers = tot.reemp + tot.rejemp) # denominator for waiting1: total number of reemployment offers (opportunities) = number of accepted offers + rejected offers, respectively.

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(waiting1 = tot.rejemp/offers) # Waiting DV1. Note many NaNs due to 0/0s. 

# Create waiting2 = 1 if a subject ever rejected reemployment. 
dat_long$waiting2 <- NA
dat_long$waiting2[dat_long$tot.rejemp>0]<-1 
dat_long$waiting2[dat_long$tot.rejemp==0]<-0 


#-------------[] Clean effort1 and effort2 DVs ------------------------------------------------------

# DV: effort measure 1: the total number of task items attempted over the number of rounds played.

# DV: effort measure 2: The total number of accurate responses over the number of rounds played. 
dat_long <- dat_long %>% group_by(id) %>% 
  mutate(total_accurate = sum(total)) 

dat_long$roundsplayed <- ifelse(dat_long$total>0 , 1, 0) 

dat_long <- dat_long %>% group_by(id) %>% 
  mutate(totalroundsplayed = sum(roundsplayed))

dat_long <- dat_long %>% group_by(id) %>%
  mutate(effort2 = total_accurate/totalroundsplayed) 

# bonus DV: maxscore
dat_long <- dat_long %>% group_by(id) %>%
  mutate(maxscore = max(total))

# DV: effort measure 1: the ave number of accurate responses by respoondent i over T rounds.  
# PROBLEM: this DVs require values that vary by i and t, but only the pay vars meet this condition.   
# the two vars that vary by i and t are: total[1-15] or "total items that are correctly answered in rounds [1, 15]" and roundpay[1-15] or "payment for performance in round t".  
# outcomestring[1-15] varies by i, but is constant across t. 
# targetstring[1-15] and valuestring[1-15] is constant across i, but varies across t   


#-------------[] merge new waiting1, waiting2, effort1, and maxscore DVs to main dataframe. ---------------------------------------------------------------------------------

dat_long.agg <- aggregate(cbind(waiting1, waiting2, effort2, maxscore, tot.rejemp, offers) ~ id, data = dat_long, FUN = max, na.action = na.pass) # collapse all observations to the lvl of subjects. 
dat <- merge(dat, dat_long.agg , by=c("id"), sort = TRUE)
rm(dat_long.agg)

# variables encoding experience with unemployment
dat$unemp.exp<-dat$ct.unemp.exp<-NA
for(i in 1:nrow(dat)){
  dat$unemp.exp[i]<-0%in%dat[i,124:138] 
  dat$ct.unemp.exp[i]<-sum(dat[i,124:138]==0)
}



#-------------[] covariates: encode with meaningful names ---------------------------------------------------------------------------------

# factor var to indicate the 6 treatment status variations.  
levels(dat$Treatment)
dat$treat <- "LowNone"
dat$treat[dat$Treatment==5] <- "LowMinimal"
dat$treat[dat$Treatment==6] <- "LowGenerous"
dat$treat[dat$Treatment==3] <- "HiGenerous"
dat$treat[dat$Treatment==2] <- "HiMinimal"
dat$treat[dat$Treatment==1] <- "HiNone"
dat$treat <- as.factor(dat$treat)
summary(dat$treat)

# factor var = "high" if i faces an high unemployment rate of p=0.25 versus a low rate p=0.10. 
dat$unemprate <- "low"
dat$unemprate[dat$Treatment %in% c(1:3)] <- "high"
dat$unemprate <- as.factor(dat$unemprate)
summary(dat$unemprate)

# factor var to indicate the level of unemployment insurance
dat$UI <- "none"
dat$UI[dat$Treatment %in% c(5,2)] <- "minimal"
dat$UI[dat$Treatment %in% c(3,6)] <- "generous"
dat$UI <- as.factor(dat$UI)
summary(dat$UI)

# control vars. rename  
colnames(dat)[colnames(dat)=="Q9"]  <- "sex"
colnames(dat)[colnames(dat)=="Q10"] <- "age"
colnames(dat)[colnames(dat)=="Q11"] <- "race"
colnames(dat)[colnames(dat)=="Q11_5_TEXT"] <- "race_other"
colnames(dat)[colnames(dat)=="Q12"] <- "area_raised"
colnames(dat)[colnames(dat)=="Q13"] <- "politics"
colnames(dat)[colnames(dat)=="Q14"] <- "vote"
colnames(dat)[colnames(dat)=="Q15"] <- "party_vote"
colnames(dat)[colnames(dat)=="Q16"] <- "partyid"
colnames(dat)[colnames(dat)=="Q17"] <- "educ"
colnames(dat)[colnames(dat)=="Q18"] <- "emp"
colnames(dat)[colnames(dat)=="Q19"] <- "htotinc"
colnames(dat)[colnames(dat)=="Q20"] <- "trust"
colnames(dat)[colnames(dat)=="Q21"] <- "race_contact"
colnames(dat)[colnames(dat)=="Q22"] <- "race_friends"
colnames(dat)[colnames(dat)=="Q23"] <- "trade_union"
colnames(dat)[colnames(dat)=="Q37"] <- "risk"
colnames(dat)[colnames(dat)=="Q50"] <- "reemp_offer1"
colnames(dat)[colnames(dat)=="Q51"] <- "reemp_offer2"
colnames(dat)[colnames(dat)=="Duration..in.seconds."] <- "duration"

#-------------[] drop columns that are blank, unused, or irrelavent  ------------------------------------------------------------------------------------------------------------

names(dat) # identify column numbers (have added back in total_correct which refers to practice round)
dat <- subset(dat, select = -c(EndDate, Status, IPAddress, Progress, Finished, RecordedDate, inlab, # drop survey indicators
                               RecipientLastName,	RecipientFirstName,	RecipientEmail,	ExternalReference, LocationLatitude, LocationLongitude, DistributionChannel, UserLanguage, # drop subject vars
                               Q25_1_1,	Q25_2_1,	Q25_3_1,	Q25_4_1,	Q25_5_1,	Q25_6_1,	Q25_7_1,	Q25_8_1,	Q231,	Q226,	Q55,	Q56,	Q57,	Q61,	Q228, # drop general info and quiz vars
                               Q49_First.Click,	Q49_Last.Click,	Q49_Page.Submit,	Q49_Click.Count,
                               reemployed,  employed, roundnum, # drop vars that misleadingly only report values for the final (15th) round
                               109:123, 124:138, 199:213, 215:230)) # drop total[1-15], employed[1-15], rejemp[1-15] and mTurk-specific admin indicators, respectively

names(dat) 
#dat <- as.data.frame(dat[, c(1, 131, 132:140, 2:130)]) # reorder vars for user friendliness. 

#-------------[] purge all factor vars of unused admin levels  ------------------------------------------------------------------------------------------------------------

levels(dat$sex) # example: sex has 5 lvls, but it should just have 3: male, female, other. 
dat <- droplevels.data.frame(dat) # drops unused levels---those with no observations. 
levels(dat$sex) # confirm success


#-------------[] misc. cleaning  ------------------------------------------------------------------------------------------------------------

# make values of ftotinc comparable across experimental contexts
levels(dat$htotinc)

#-------------[] convert vars to appropriate type  ------------------------------------------------------------------------------------------------------------

# note this issue: https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information 
dat$age <- as.numeric(as.character(dat$age)) 
dat$duration <- as.numeric(as.character(dat$duration))
dat$amount_to_pay <- as.numeric(as.character(dat$amount_to_pay))
dat$payamt1 <- as.numeric(as.character(dat$payamt1))
dat$payamt2 <- as.numeric(as.character(dat$payamt2))
dat$payamt3 <- as.numeric(as.character(dat$payamt3))
dat$ecusubtotal <- as.numeric(as.character(dat$ecusubtotal))
dat$gbpsubtotal <- as.numeric(as.character(dat$gbpsubtotal))
dat$payamtavg <- as.numeric(as.character(dat$payamtavg))
dat$waiting2 <- as.factor(dat$waiting2)
dat$total_correct <- as.numeric(dat$total_correct)

#-------------[] generate codebook  ---------------------------------------------------------------------------------------------------------------
# 
makeDataReport(dat, file = "codebook.Rmd", replace = TRUE)  # Produce data report by DataMaid

#-------------[] returns an analysis-ready .csv data file ---------------------------------------------------------------------------------------------

write.csv(dat, file = "AhlquistAnsell_UIskills_cleaned.csv",row.names=FALSE) 

#-------------[] END ---------------------------------------------------------------------------------------------
